/**
 * @author 李亚松
 * @email  lysong01@gmail.com
 * 
 * RegexUtil.java 2012-12-5
 * description 
 */
package liyasong.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that strip HTML markup and entities from text using
 * regular expressions.
 *
 * <p>None of the methods accept {@code null}; passing it results in a
 * {@link NullPointerException}.
 */
public class RegexUtil {

	/** One or more consecutive {@code <br>} tags, in any letter case. */
	private static final Pattern LINE_BREAKS =
			Pattern.compile("(?:<br[\\s/]*>)+", Pattern.CASE_INSENSITIVE);

	/** Any tag-like {@code <...>} sequence, or a {@code &nbsp;} entity. */
	private static final Pattern TAG_OR_NBSP = Pattern.compile("<[^>]*>|&nbsp;");

	/** Text sitting between a {@code >} and the next {@code </span}. */
	private static final Pattern SPAN_TEXT = Pattern.compile(">[^>]*</span");

	/** A five-digit numeric entity (digits in group 1), {@code &nbsp;}, or a tab. */
	private static final Pattern ENTITY_OR_BLANK = Pattern.compile("&#(\\d{5});|&nbsp;|\t");

	/** A run of one or more newline characters. */
	private static final Pattern NEWLINE_RUN = Pattern.compile("\n+");

	/**
	 * Converts an HTML fragment to plain text: every run of {@code <br>} tags
	 * becomes a single newline, and all remaining tags and {@code &nbsp;}
	 * entities are removed.
	 *
	 * @param string the HTML fragment
	 * @return the fragment without markup
	 */
	public static String cleanHtml(String string) {
		// Line breaks must be converted before the generic tag stripping,
		// otherwise they would be deleted like any other tag.
		String withNewlines = LINE_BREAKS.matcher(string).replaceAll("\n");
		return TAG_OR_NBSP.matcher(withNewlines).replaceAll("");
	}

	/**
	 * Cleans the HTML of a Sina blog post.
	 *
	 * <p>Steps, in order:
	 * <ol>
	 * <li>drop the text that directly precedes each closing {@code </span}
	 *     (the Sina blog page source carries a lot of text in spans that is
	 *     not displayed);</li>
	 * <li>strip the markup via {@link #cleanHtml(String)};</li>
	 * <li>in a single pass, turn the numeric entities for the full-width
	 *     comma, question mark, colon and parentheses into the characters
	 *     themselves and delete every other five-digit numeric entity,
	 *     {@code &nbsp;} and tab;</li>
	 * <li>collapse consecutive newlines and remove one leading newline.</li>
	 * </ol>
	 *
	 * @param string the raw HTML of the post
	 * @return the cleaned text
	 */
	public static String cleanSinaBlog(String string) {
		String withoutSpanText = SPAN_TEXT.matcher(string).replaceAll("");
		String plain = cleanHtml(withoutSpanText);
		String collapsed = replaceBlankLine(decodeEntities(plain));
		// Only a newline at the very start is dropped; a newline further in
		// separates real lines and has to stay.
		return collapsed.startsWith("\n") ? collapsed.substring(1) : collapsed;
	}

	/**
	 * Replaces or removes every match of {@link #ENTITY_OR_BLANK} in one scan.
	 */
	private static String decodeEntities(String text) {
		Matcher matcher = ENTITY_OR_BLANK.matcher(text);
		StringBuffer out = new StringBuffer(text.length());
		while (matcher.find()) {
			String replacement = entityReplacement(matcher.group(1));
			matcher.appendReplacement(out, Matcher.quoteReplacement(replacement));
		}
		matcher.appendTail(out);
		return out.toString();
	}

	/**
	 * Maps the digits of a numeric entity to its replacement text.
	 *
	 * @param digits the five digits of the entity, or {@code null} when the
	 *               match was {@code &nbsp;} or a tab
	 * @return the decoded character for the supported punctuation entities,
	 *         otherwise the empty string
	 */
	private static String entityReplacement(String digits) {
		if (digits == null) {
			return "";
		}
		int codePoint = Integer.parseInt(digits);
		switch (codePoint) {
			case 65292: // U+FF0C fullwidth comma
			case 65311: // U+FF1F fullwidth question mark
			case 65306: // U+FF1A fullwidth colon
			case 65288: // U+FF08 fullwidth left parenthesis
			case 65289: // U+FF09 fullwidth right parenthesis
				// The entity number is the character's code point.
				return String.valueOf((char) codePoint);
			default:
				return "";
		}
	}

	/**
	 * Collapses every run of consecutive newlines into a single newline.
	 *
	 * @param string the text to process
	 * @return the text without empty lines
	 */
	public static String replaceBlankLine(String string) {
		return NEWLINE_RUN.matcher(string).replaceAll("\n");
	}

	/**
	 * Replaces each {@code &nbsp;} entity with an ordinary space.
	 *
	 * @param string the text to process
	 * @return the text with the entities replaced
	 */
	public static String replaceSpace(String string) {
		return string.replace("&nbsp;", " ");
	}

	/**
	 * Removes every space character (U+0020) from the text.
	 *
	 * @param string the text to process
	 * @return the text without spaces
	 */
	public static String cleanSpace(String string) {
		return string.replace(" ", "");
	}

	/**
	 * Entry point placeholder; currently does nothing.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		// Intentionally empty.
	}

}
